Single cell RNA-seq of circulating breast cancer cells from
19 women with ER+/HER2- primary tumours. Analysis of 74 single candidate circultaing tumor cells (CTCs) from two representative ER+/HER2- patients (BRx-42 and BRx-82) and 14 triple negative patients were isolated via micromanipulation and subjected to single-cell RNA-sequencing. The goal is to distinguish groups of HER2- and HER2+ CTCs to look at drug susceptibility and targeted/combination based therapies. The method used for analysis was Seurat, an R toolkit for single cell genomics.
Citations:
Jordan, Nicole Vincent et al. “HER2 expression identifies dynamic functional states within circulating breast cancer cells.” Nature vol. 537,7618 (2016): 102-106.
Butler, A., Hoffman, P., Smibert, P. et al. "Integrating single-cell transcriptomic data across different conditions, technologies, and species." Nat Biotechnol 36, 411–420 (2018).
Loading dataset
library(Seurat)
library(dplyr)
library(patchwork)
library(Matrix)
library(readr)
library(clusterProfiler)
library(org.Hs.eg.db)
library(plyr)
data <- readMM("/Users/ericabrophy/Documents/BMI511_translational/BreastCancerDataset/E-GEOD-75367.aggregated_filtered_counts.mtx")
data_col <- read_delim("/Users/ericabrophy/Documents/BMI511_translational/BreastCancerDataset/E-GEOD-75367.aggregated_filtered_counts.mtx_cols",
"\t", escape_double = FALSE, col_names = FALSE,
trim_ws = TRUE)
data_row <- read_delim("/Users/ericabrophy/Documents/BMI511_translational/BreastCancerDataset/E-GEOD-75367.aggregated_filtered_counts.mtx_rows",
"\t", escape_double = FALSE, col_names = FALSE,
trim_ws = TRUE)
metadata <- read_delim("/Users/ericabrophy/Documents/BMI511_translational/BreastCancerDataset/E-GEOD-75367.sdrf.txt",
"\t", escape_double = FALSE, trim_ws = TRUE)
Reshaping data
data <- as.matrix(data)
data_row <- data_row[, -1]
rownames(data)<-data_row$X2
colnames(data)<-data_col$X1
data.temp = data
data.temp[data.temp==0] = NA
hist(log10(as.vector(data.temp)+ 1))
Assign actual gene names to IDs
gene.df <- bitr(data_row$X2, fromType = "ENSEMBL",
toType = c("ENSEMBL", "SYMBOL"),
OrgDb = org.Hs.eg.db)
Pull out translated gene names
gene_symbol <- gene.df[,2]
Summary of the data
head(gene.df, n=5)
## ENSEMBL SYMBOL
## 1 ENSG00000000003 TSPAN6
## 2 ENSG00000000419 DPM1
## 3 ENSG00000000457 SCYL3
## 4 ENSG00000000460 C1orf112
## 5 ENSG00000000938 FGR
summary(gene.df, n=5)
## ENSEMBL SYMBOL
## Length:17265 Length:17265
## Class :character Class :character
## Mode :character Mode :character
Subsetting new data
intersection <- gene.df[,1] %in% rownames(data)
data[intersection, ]
data_subset <- data[gene.df[,1], ]
data_subset <- as.data.frame(data_subset)
Match gene labels for both dataframes
genomic_idx <- match(rownames(data_subset), gene.df[,1])
Renaming rows of new data matrix to gene names
rownames(data_subset) = make.names(gene.df$SYMBOL, unique=TRUE)
The Seurat object serves as a container that contains both data and analysis
BCdata <- CreateSeuratObject(counts = data_subset,
project = "BreastCancer",
min.cells = 3,
min.features = 200)
FeatureScatter(BCdata,"nCount_RNA","nFeature_RNA")
Check for MT genes -> QC
rownames(data_subset) = make.names(gene.df$SYMBOL, unique=TRUE)
head(grep(pattern = "^MT-", x = rownames(data_subset), value = TRUE))
BCdata[grep(pattern = "^MT-", x = rownames(data_subset))]
QC
BCdata[["percent.mt"]] <- PercentageFeatureSet(BCdata, pattern = "^MT-")
head(BCdata@meta.data, 5) #There are no MT genes in the dataset
## orig.ident nCount_RNA nFeature_RNA percent.mt
## BRx_111_1__10_08_15 BRx 2236252.6 5539 0
## BRx_111_1__10_19_15 BRx 2398740.9 5270 0
## BRx_111_2__10_08_15 BRx 828232.8 2165 0
## BRx_111_3__10_08_15 BRx 204546.6 3768 0
## BRx_129_2_111814 BRx 376391.6 5882 0
VlnPlot(BCdata, features = c("nFeature_RNA", "nCount_RNA", "percent.mt"), ncol = 3)
Normalize data
#multiplies this by a scale factor (10,000 by default), and log-transforms the result
BCdata <- NormalizeData(BCdata, normalization.method = "LogNormalize", scale.factor = 10000)
BCdata[["RNA"]]@data
## [[ suppressing 32 column names 'BRx_111_1__10_08_15', 'BRx_111_1__10_19_15', 'BRx_111_2__10_08_15' ... ]]
## [[ suppressing 32 column names 'BRx_111_1__10_08_15', 'BRx_111_1__10_19_15', 'BRx_111_2__10_08_15' ... ]]
## [[ suppressing 32 column names 'BRx_111_1__10_08_15', 'BRx_111_1__10_19_15', 'BRx_111_2__10_08_15' ... ]]
barplot(BCdata@meta.data$nFeature_RNA)
Feature Selection
Plot variable features with and without labels
# Identify the 10 most highly variable genes
top10 <- head(VariableFeatures(BCdata), 10)
LabelPoints(plot = VariableFeaturePlot(BCdata), points = top10, repel = TRUE)
Scaling the data
#apply a linear transformation (‘scaling’) that is a standard pre-processing
#step prior to dimensional reduction techniques like PCA.
all.genes <- rownames(BCdata)
BCdata <- ScaleData(BCdata, features = all.genes)
BCdata[["RNA"]]@scale.data
PCA
BCdata <- RunPCA(BCdata, features = VariableFeatures(object = BCdata), npcs = 30, verbose = FALSE, approx=FALSE)
print(BCdata[["pca"]], dims = 1:5, nfeatures = 5)
## PC_ 1
## Positive: TUBB1, RGS18, H2AC6, CAVIN2, CD226
## Negative: GGCT, NPM1, VDAC1, KRT7, UQCC2
## PC_ 2
## Positive: ASRGL1, KYNU, NRK, LOX, AACS
## Negative: CRABP2, HOXB2, S100A16, CALML5, FLYWCH2
## PC_ 3
## Positive: GABPB2, SAMHD1, ZNF37BP, KLRD1, SPG7
## Negative: PPBP, SCP2, SPARC, HPGD, SNN
## PC_ 4
## Positive: KYNU, KLRD1, NOA1, RPGRIP1, PLAC8
## Negative: EME1, IL17RB, FAM72D, WDHD1, DTL
## PC_ 5
## Positive: HBD, SLC4A1, CA1, HBM, AHSP
## Negative: TAPBP, NXPH1, FLYWCH2, EPHX2, MSH6
VizDimLoadings(BCdata, dims = 1:2, reduction = "pca")
DimPlot(BCdata, reduction = "pca")
DimHeatmap
#Easy exploration of the primary sources of heterogeneity in a dataset
#and can be useful when trying to decide which PCs to include for further downstream analyses
DimHeatmap(BCdata, dims = 1, cells = 500, balanced = TRUE)
DimHeatmap(BCdata, dims = 1:5, cells = 500, balanced = TRUE)
Jackstraw Plot
#a ranking of principle components based on the percentage of variance explained by each one
BCdata <- JackStraw(BCdata, num.replicate = 100)
BCdata <- ScoreJackStraw(BCdata, dims = 1:20)
JackStrawPlot(BCdata, dims = 1:15)
ElbowPlot(BCdata)
Clustering cells
BCdata <- FindNeighbors(BCdata, dims = 1:10)
BCdata <- FindClusters(BCdata, resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 74
## Number of edges: 1366
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.6831
## Number of communities: 2
## Elapsed time: 0 seconds
head(Idents(BCdata), 5)
## BRx_111_1__10_08_15 BRx_111_1__10_19_15 BRx_111_2__10_08_15 BRx_111_3__10_08_15
## 0 0 0 0
## BRx_129_2_111814
## 0
## Levels: 0 1
UMAP
BCdata <- RunUMAP(BCdata, dims = 1:10)
DimPlot(BCdata, reduction = "umap")
Find all markers for a cluster -> cluster 1
cluster1.markers <- FindMarkers(BCdata, ident.1 = 1, min.pct = 0.25)
head(cluster1.markers, n = 5)
## p_val avg_logFC pct.1 pct.2 p_val_adj
## GGCT 9.000078e-16 1.6405363 1.000 0.044 1.553864e-11
## MAL2 1.077508e-14 1.4468702 1.000 0.133 1.860318e-10
## SDC1 1.530493e-14 1.5576624 0.931 0.022 2.642396e-10
## UQCC2 3.380903e-14 0.6984841 0.966 0.067 5.837129e-10
## SNRPE 6.288319e-14 0.9916587 1.000 0.156 1.085678e-09
Find markers for every cluster compared to all remaining cells, report only the positive ones
BCdata.markers <- FindAllMarkers(BCdata, only.pos = TRUE, min.pct = 0.25, logfc.threshold = 0.25)
BCdata.markers %>% group_by(cluster) %>% top_n(n = 2, wt = avg_logFC)
## # A tibble: 4 x 7
## # Groups: cluster [2]
## p_val avg_logFC pct.1 pct.2 p_val_adj cluster gene
## <dbl> <dbl> <dbl> <dbl> <dbl> <fct> <chr>
## 1 0.00000159 3.07 0.889 0.759 0.0275 0 TUBB1
## 2 0.000305 3.14 0.911 0.897 1 0 PPBP
## 3 0.0000000680 2.94 0.759 0.244 0.00117 1 PIP
## 4 0.000000933 2.68 0.793 0.467 0.0161 1 APOD
The ROC test returns the ‘classification power’ for any individual marker (ranging from 0 - random, to 1 - perfect)
cluster1.markers <- FindMarkers(BCdata, ident.1 = 0, logfc.threshold = 0.25, test.use = "roc", only.pos = TRUE)
head(cluster1.markers, n = 5)
## myAUC avg_diff power pct.1 pct.2
## TAL1 0.935 1.830775 0.870 0.956 0.517
## LAT 0.917 1.885478 0.834 0.933 0.690
## SLC2A3 0.900 2.677472 0.800 0.933 0.724
## GMPR 0.900 1.330273 0.800 0.956 0.724
## CCL5 0.893 2.306072 0.786 1.000 0.897
Shows expression probability distributions across clusters
VlnPlot(BCdata, features = c("STX12", "HBA1"))
FeaturePlot(BCdata, features = c("STX12", "ATL3", "PTPA", "YIPF5", "MARCHF5", "HBD", "HBA1", "HBA2","HBB", "CA1"))
Generates an expression heatmap for given cells and features
top10 <- BCdata.markers %>% group_by(cluster) %>% top_n(n = 10, wt = avg_logFC)
DoHeatmap(BCdata, features = top10$gene) + NoLegend()
Naming clusters
new.cluster.ids <- c("HER2-", "HER2+")
names(new.cluster.ids) <- levels(BCdata)
BCdata <- RenameIdents(BCdata, new.cluster.ids)
plot <- DimPlot(BCdata, reduction = "umap", label = TRUE, pt.size = 0.5) + NoLegend()
HoverLocator(plot = plot, information = FetchData(BCdata, vars = c("ident", "PC_1", "nFeature_RNA")))
TSNE
BCdata <- RunTSNE(object = BCdata, dims = 1:2, resolution = 0.5, perplexity = 10)
plot <- DimPlot(object = BCdata, reduction = "tsne", label = TRUE)
HoverLocator(plot = plot, information = FetchData(BCdata, vars = c("ident", "PC_1", "nFeature_RNA")))
Dotplot of genes identified showing expression levels in the different clusters
features <- c("STX12", "ATL3", "PTPA", "YIPF5", "MARCHF5", "HBD", "HBA1", "HBA2","HBB", "CA1")
features2 <- c("APOD", "PIP", "CRABP2", "XBP1", "HSP90AB1", "F13A1", "TUBB1", "RGS18", "H2AC6", "CAVIN2", "CD226")
DotPlot(BCdata, features = features) + RotatedAxis()
DotPlot(BCdata, features = features2) + RotatedAxis()
Heatmap showing expression levels of genes identified in clusters
DoHeatmap(subset(BCdata, downsample = 100), features = features2, size = 3)
Interactive Plot to show NOTCH1 expression in data
plot <- FeaturePlot(BCdata, features = "NOTCH1")
HoverLocator(plot = plot, information = FetchData(BCdata, vars = c("ident", "PC_1", "nFeature_RNA")))
Shiny App URL
#https://ebrophy.shinyapps.io/SingleCellBreastCancer_Heatmap/